林嶔 (Lin, Chin)
Lesson 2
– 陣列(Array)層分為矩陣(matrix)及資料表(data.frame):
矩陣(matrix)要求裡面的每個元素都為單一種類,因此與變數(Variable)層的向量類似,分為邏輯(logical)矩陣、整數(integer)矩陣、數字(numeric)矩陣、文字(character)矩陣[注意:因子(factor)向量被轉為矩陣時會變為文字矩陣]
資料表(data.frame)可以允許每欄有不同的屬性,一個資料表內可以同時擁有不同屬性的變項。
# Numeric (integer) matrix object. Note the fill order: the values of x1 are
# placed column by column (1-4 go down the first column, 5-8 down the second,
# and so on), not row by row.
x1 = 1:20
M1 = matrix(x1, nrow = 4, ncol = 5)
M1
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1 5 9 13 17
## [2,] 2 6 10 14 18
## [3,] 3 7 11 15 19
## [4,] 4 8 12 16 20
# Matrix built from a factor vector. As the lesson text notes, a factor that
# is turned into a matrix becomes a character matrix, so M2 is not a
# "factor matrix": it stores the labels as text.
x2 = c(0, 1, 2, 0, 2, 1, 1, 2, 0)
x2 = as.factor(x2)
M2 = matrix(x2, nrow = 3, ncol = 3)
M2
函數「t()」可以求得轉置矩陣
函數「solve()」可以求得反矩陣
矩陣的乘法需要利用函數「%*%」來完成
x = 1:4
X = matrix(x, nrow = 2, ncol = 2)
X
## [,1] [,2]
## [1,] 1 3
## [2,] 2 4
t(X)
## [,1] [,2]
## [1,] 1 2
## [2,] 3 4
solve(X)
## [,1] [,2]
## [1,] -2 1.5
## [2,] 1 -0.5
y = 2:5
Y = matrix(y, nrow = 2, ncol = 2)
Y
## [,1] [,2]
## [1,] 2 4
## [2,] 3 5
# Note the difference between the next two operations:
# `%*%` is the matrix product (rows of X times columns of Y), whereas `*`
# multiplies the two matrices element by element.
X%*%Y
## [,1] [,2]
## [1,] 11 19
## [2,] 16 28
X*Y
## [,1] [,2]
## [1,] 2 12
## [2,] 6 20
函數「length()」仍然可以用來查詢此陣列的總長度(長*寬)
函數「nrow()」、「ncol()」可以查詢陣列的列欄數目
x1 = 1:20
M1 = matrix(x1, nrow = 4, ncol = 5)
length(M1)
## [1] 20
nrow(M1)
## [1] 4
ncol(M1)
## [1] 5
x1 = 1:20
M1 = matrix(x1, nrow = 4, ncol = 5)
M1
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1 5 9 13 17
## [2,] 2 6 10 14 18
## [3,] 3 7 11 15 19
## [4,] 4 8 12 16 20
# 猜猜這些指令是叫出哪個數字
M1[3,4]
M1[2,5]
M1[4,1]
# 也猜猜這些指令是叫出哪個東西
M1[3,]
M1[,1]
M1[,c(2, 5)]
x1 = 1:20
M1 = matrix(x1, nrow = 4, ncol = 5)
rownames(M1) = c("a", "b", "c", "d")
M1
## [,1] [,2] [,3] [,4] [,5]
## a 1 5 9 13 17
## b 2 6 10 14 18
## c 3 7 11 15 19
## d 4 8 12 16 20
colnames(M1) = c("A", "B", "C", "D", "E")
M1
## A B C D E
## a 1 5 9 13 17
## b 2 6 10 14 18
## c 3 7 11 15 19
## d 4 8 12 16 20
M1["c","A"]
M1[,c("E", "C")]
colnames(M1)[3] = "ZZZ"
M1
## A B ZZZ D E
## a 1 5 9 13 17
## b 2 6 10 14 18
## c 3 7 11 15 19
## d 4 8 12 16 20
x1 = 1:20
M1 = matrix(x1, nrow = 4, ncol = 5)
D1 = data.frame(M1)
D1
## X1 X2 X3 X4 X5
## 1 1 5 9 13 17
## 2 2 6 10 14 18
## 3 3 7 11 15 19
## 4 4 8 12 16 20
rownames(M1) = c("a", "b", "c", "d")
colnames(M1) = c("A", "B", "C", "D", "E")
D2 = data.frame(M1)
D2
## A B C D E
## a 1 5 9 13 17
## b 2 6 10 14 18
## c 3 7 11 15 19
## d 4 8 12 16 20
colnames(D1) = c("A", "B", "C", "D", "E")
D1
## A B C D E
## 1 1 5 9 13 17
## 2 2 6 10 14 18
## 3 3 7 11 15 19
## 4 4 8 12 16 20
a = c(TRUE, FALSE, TRUE, FALSE, FALSE)
class(a)
## [1] "logical"
b = c(0.7, -0.9, 1.2, -2.1, 3.7)
class(b)
## [1] "numeric"
c = c("A", "B", "C", "C", "B")
c = as.factor(c)
class(c)
## [1] "factor"
DATA = data.frame(a, b, c)
DATA
## a b c
## 1 TRUE 0.7 A
## 2 FALSE -0.9 B
## 3 TRUE 1.2 C
## 4 FALSE -2.1 C
## 5 FALSE 3.7 B
class(DATA[,1])
## [1] "logical"
class(DATA[,2])
## [1] "numeric"
class(DATA[,3])
## [1] "factor"
DATA$b
## [1] 0.7 -0.9 1.2 -2.1 3.7
DATA[,2]
## [1] 0.7 -0.9 1.2 -2.1 3.7
DATA[,"b"]
## [1] 0.7 -0.9 1.2 -2.1 3.7
a = c(TRUE, FALSE, TRUE, FALSE, FALSE)
b = c(0.7, -0.9, 1.2, -2.1, 3.7)
c = c("A", "B", "C", "C", "B")
DATA = data.frame(a, b, c)
DATA
## a b c
## 1 TRUE 0.7 A
## 2 FALSE -0.9 B
## 3 TRUE 1.2 C
## 4 FALSE -2.1 C
## 5 FALSE 3.7 B
DATA$d = c(1, 5, 8, 9, 12)
DATA
## a b c d
## 1 TRUE 0.7 A 1
## 2 FALSE -0.9 B 5
## 3 TRUE 1.2 C 8
## 4 FALSE -2.1 C 9
## 5 FALSE 3.7 B 12
x = 1:4
X = matrix(x, nrow = 2, ncol = 2)
y = 2:5
Y = matrix(y, nrow = 2, ncol = 2)
rbind(X, Y)
## [,1] [,2]
## [1,] 1 3
## [2,] 2 4
## [3,] 2 4
## [4,] 3 5
rbind(Y, X)
## [,1] [,2]
## [1,] 2 4
## [2,] 3 5
## [3,] 1 3
## [4,] 2 4
cbind(X, Y)
## [,1] [,2] [,3] [,4]
## [1,] 1 3 2 4
## [2,] 2 4 3 5
x = 1:4
y = 6:8
# 問題,如何合併它們並儲存成物件z呢?
做為一個統計程式語言,R支援了多種統計檢定功能,但在執行這些功能的第一步,是要將檔案讀進R的工作環境裡面。
請按這裡下載等等要讀取的檔案
– 這個檔案記錄了民國105年(2016年)4月25日早上5點30分台北市各監測站的各項空氣汙染物濃度。
– 使用函數「read.csv()」讀取csv檔
# Read the CSV file into a data.frame. header = TRUE treats the first row of
# the file as the column names; fileEncoding = 'CP950' declares the text
# encoding the file was saved in.
dat = read.csv("data2_1.csv", header = TRUE, fileEncoding = 'CP950') # change the path to wherever you saved the file
請注意,在這邊我們第一次面對到這種比較複雜的函數,注意到上面那串程式碼在小括號內有一個【header = TRUE】的指令,他是在說讀取檔案時,該檔案的首列為『欄位名稱』。
由於之後面對的函數可能會有極多的input,因此要查看這些函數能更改什麼參數的時候,可以使用函數「help()」
help(read.csv)
?read.csv #也可以這樣查詢
class(dat)
## [1] "data.frame"
head(dat)
## time device_id s_d0 s_t0 s_h0 lat lon
## 1 2016-04-25 04:42:16 28C2DDDD4505 17 29.75 77 24.99643 121.5483
## 2 2016-04-25 05:25:11 28C2DDDD450C 27 27.50 87 25.05221 121.5637
## 3 2016-04-25 05:23:57 28C2DDDD4534 27 30.00 65 25.08374 121.5070
## 4 2016-04-25 05:25:13 28C2DDDD47C6 30 26.12 93 25.01992 121.5305
## 5 2016-04-25 05:24:48 28C2DDDD4234 30 28.00 83 25.00244 121.5519
## 6 2016-04-25 05:24:48 28C2DDDD4234 30 28.00 83 25.00244 121.5519
## school time2
## 1 景興國小 2016/4/25 5:30
## 2 西松國小 2016/4/25 5:30
## 3 葫蘆國小 2016/4/25 5:30
## 4 溫州國宅 2016/4/25 5:30
## 5 興隆國小 2016/4/25 5:30
## 6 興隆國小 2016/4/25 5:30
summary(dat)
## time device_id s_d0
## 2016-04-25 05:23:46: 2 28C2DDDD41FA: 2 Min. :17.00
## 2016-04-25 05:24:00: 2 28C2DDDD4234: 2 1st Qu.:31.50
## 2016-04-25 05:24:48: 2 28C2DDDD400A: 1 Median :35.50
## 2016-04-25 04:42:16: 1 28C2DDDD41B2: 1 Mean :34.86
## 2016-04-25 05:23:45: 1 28C2DDDD41C0: 1 3rd Qu.:38.25
## 2016-04-25 05:23:54: 1 28C2DDDD41EB: 1 Max. :43.00
## (Other) :19 (Other) :20
## s_t0 s_h0 lat lon
## Min. :26.12 Min. : 65.00 Min. :24.99 Min. :121.5
## 1st Qu.:27.75 1st Qu.: 84.00 1st Qu.:25.03 1st Qu.:121.5
## Median :28.00 Median : 88.00 Median :25.05 Median :121.5
## Mean :28.10 Mean : 89.39 Mean :25.05 Mean :121.5
## 3rd Qu.:28.53 3rd Qu.: 89.75 3rd Qu.:25.07 3rd Qu.:121.6
## Max. :30.00 Max. :151.00 Max. :25.13 Max. :121.6
##
## school time2
## 民生國小: 2 2016/4/25 5:30:28
## 興隆國小: 2
## 仁愛國小: 1
## 內湖國小: 1
## 南湖國小: 1
## 古亭國小: 1
## (Other) :20
在使用函數「head()」時我們湊巧發現了一件事情,那就是第5列和第6列似乎重複了,而這個檔案似乎有不只一處的重複資料,因此我們要開始做資料清理
在教大家簡單的資料清理函數之前,由於我們上一節課已經教過了迴圈功能,我希望大家先用迴圈功能檢查這份資料有多少筆重複資料,請將下面的清理方式寫成程式碼
註:我們在確認時可能會用到一個新函數「%in%」,它的功能是確認左邊的物件是否有在右邊的物件中出現過,如下
dat[5,"school"]
## [1] 興隆國小
## 26 Levels: 仁愛國小 內湖國小 南湖國小 ... 西門國小
dat[1:4,"school"]
## [1] 景興國小 西松國小 葫蘆國小 溫州國宅
## 26 Levels: 仁愛國小 內湖國小 南湖國小 ... 西門國小
dat[5,"school"] %in% dat[1:4,"school"]
## [1] FALSE
dat[6,"school"]
## [1] 興隆國小
## 26 Levels: 仁愛國小 內湖國小 南湖國小 ... 西門國小
dat[1:5,"school"]
## [1] 景興國小 西松國小 葫蘆國小 溫州國宅 興隆國小
## 26 Levels: 仁愛國小 內湖國小 南湖國小 ... 西門國小
dat[6,"school"] %in% dat[1:5,"school"]
## [1] TRUE
i = 2
dat[i,"school"] %in% dat[1:(i-1),"school"]
i = 3
dat[i,"school"] %in% dat[1:(i-1),"school"]
i = 4
dat[i,"school"] %in% dat[1:(i-1),"school"]
i = 5
dat[i,"school"] %in% dat[1:(i-1),"school"]
i = 6
dat[i,"school"] %in% dat[1:(i-1),"school"]
# Flag every row whose school already appeared in an earlier row.
# Start with all rows unflagged. rep() (instead of a bare FALSE) also works
# when dat has zero rows. `dat[,'DUP'] = ...` is an equivalent way to add
# the column.
dat$DUP = rep(FALSE, nrow(dat))
# seq_len(nrow(dat))[-1] is 2, 3, ..., nrow(dat) and is empty when dat has
# fewer than two rows. `2:nrow(dat)` would instead count down (2, 1) for a
# one-row table and touch a row that does not exist.
for (i in seq_len(nrow(dat))[-1]) {
  # TRUE when the school in row i is found among rows 1 .. i-1.
  dat[i, "DUP"] = dat[i, "school"] %in% dat[seq_len(i - 1), "school"]
}
head(dat)
## time device_id s_d0 s_t0 s_h0 lat lon
## 1 2016-04-25 04:42:16 28C2DDDD4505 17 29.75 77 24.99643 121.5483
## 2 2016-04-25 05:25:11 28C2DDDD450C 27 27.50 87 25.05221 121.5637
## 3 2016-04-25 05:23:57 28C2DDDD4534 27 30.00 65 25.08374 121.5070
## 4 2016-04-25 05:25:13 28C2DDDD47C6 30 26.12 93 25.01992 121.5305
## 5 2016-04-25 05:24:48 28C2DDDD4234 30 28.00 83 25.00244 121.5519
## 6 2016-04-25 05:24:48 28C2DDDD4234 30 28.00 83 25.00244 121.5519
## school time2 DUP
## 1 景興國小 2016/4/25 5:30 FALSE
## 2 西松國小 2016/4/25 5:30 FALSE
## 3 葫蘆國小 2016/4/25 5:30 FALSE
## 4 溫州國宅 2016/4/25 5:30 FALSE
## 5 興隆國小 2016/4/25 5:30 FALSE
## 6 興隆國小 2016/4/25 5:30 TRUE
– 函數「duplicated()」可以直接找尋整個資料表是否有重複的列
dat = read.csv("data2_1.csv", header = TRUE, fileEncoding = 'CP950') #請將路徑改為自己放置的位置
dat$DUP1 = duplicated(dat)
dat[,c("school", "DUP1")]
## school DUP1
## 1 景興國小 FALSE
## 2 西松國小 FALSE
## 3 葫蘆國小 FALSE
## 4 溫州國宅 FALSE
## 5 興隆國小 FALSE
## 6 興隆國小 TRUE
## 7 萬大國小 FALSE
## 8 舊莊國小 FALSE
## 9 台北市立大學附設實驗國小 FALSE
## 10 古亭國小 FALSE
## 11 永建國小 FALSE
## 12 福林國小 FALSE
## 13 清江國小 FALSE
## 14 富安國小 FALSE
## 15 敦化國小 FALSE
## 16 潭美國小 FALSE
## 17 西門國小 FALSE
## 18 仁愛國小 FALSE
## 19 文湖國小 FALSE
## 20 大佳國小 FALSE
## 21 吉林國小 FALSE
## 22 懷生國小 FALSE
## 23 大同國小 FALSE
## 24 內湖國小 FALSE
## 25 民生國小 FALSE
## 26 立農國小 FALSE
## 27 民生國小 TRUE
## 28 南湖國小 FALSE
# duplicated() on a single column: TRUE when that school name has already
# appeared in an earlier row, regardless of the values in the other columns.
dat$DUP2 = duplicated(dat$school)
# Show both flags side by side for comparison.
dat[,c("school", "DUP1", "DUP2")]
## school DUP1 DUP2
## 1 景興國小 FALSE FALSE
## 2 西松國小 FALSE FALSE
## 3 葫蘆國小 FALSE FALSE
## 4 溫州國宅 FALSE FALSE
## 5 興隆國小 FALSE FALSE
## 6 興隆國小 TRUE TRUE
## 7 萬大國小 FALSE FALSE
## 8 舊莊國小 FALSE FALSE
## 9 台北市立大學附設實驗國小 FALSE FALSE
## 10 古亭國小 FALSE FALSE
## 11 永建國小 FALSE FALSE
## 12 福林國小 FALSE FALSE
## 13 清江國小 FALSE FALSE
## 14 富安國小 FALSE FALSE
## 15 敦化國小 FALSE FALSE
## 16 潭美國小 FALSE FALSE
## 17 西門國小 FALSE FALSE
## 18 仁愛國小 FALSE FALSE
## 19 文湖國小 FALSE FALSE
## 20 大佳國小 FALSE FALSE
## 21 吉林國小 FALSE FALSE
## 22 懷生國小 FALSE FALSE
## 23 大同國小 FALSE FALSE
## 24 內湖國小 FALSE FALSE
## 25 民生國小 FALSE FALSE
## 26 立農國小 FALSE FALSE
## 27 民生國小 TRUE TRUE
## 28 南湖國小 FALSE FALSE
# DUP1 (whole-row duplicates) and DUP2 (repeated school names only) flag
# exactly the same rows in this data set.
all.equal(dat$DUP1, dat$DUP2)
## [1] TRUE
– 由於我們已經創造了一個新變數「DUP1」,若它為FALSE就是唯一的個案,為TRUE的就是重複的
# Keep only the rows that are not flagged as duplicates.
dat.clean = dat[dat$DUP1==FALSE,]
# Three progressively refined ways to save the result. All three write to the
# same file name, so each call overwrites the file produced by the previous one.
# 1) Defaults only: row names are written as an extra first column.
write.csv(dat.clean, "data2_1 clean.csv")
# 2) Write in CP950 encoding, without the row-name column and without quotes
#    around text values.
#    NOTE(review): with quote = FALSE a text value containing a comma would
#    break the column layout - confirm the data has none.
write.csv(dat.clean, "data2_1 clean.csv", fileEncoding = 'CP950', row.names = FALSE, quote = FALSE)
# 3) Same as 2), but keep only columns 1-9 (the original columns), leaving out
#    the helper columns DUP1 and DUP2 that were appended afterwards.
write.csv(dat.clean[,1:9], "data2_1 clean.csv", fileEncoding = 'CP950', row.names = FALSE, quote = FALSE)
– 這是那些測站在早上7點的時候所測得的汙染物濃度
– 一樣先讀取檔案後再檢查一下
dat2 = read.csv("data2_2.csv", header = TRUE, fileEncoding = 'CP950') #請將路徑改為自己放置的位置
dat2$DUP1 = duplicated(dat2)
dat2.clean = dat2[dat2$DUP1 == FALSE,]
– 我們再重新讀一次兩個檔案,並且直接清理而不留下DUP變數
# Read the first file again and drop duplicated rows in one step: the logical
# vector from duplicated() is used directly as the row filter, so no helper
# DUP column is added to the table.
dat1 = read.csv("data2_1.csv", header = TRUE, fileEncoding = 'CP950') # change the path to wherever you saved the file
dat1.clean = dat1[duplicated(dat1)==FALSE,]
head(dat1.clean)
## time device_id s_d0 s_t0 s_h0 lat lon
## 1 2016-04-25 04:42:16 28C2DDDD4505 17 29.75 77 24.99643 121.5483
## 2 2016-04-25 05:25:11 28C2DDDD450C 27 27.50 87 25.05221 121.5637
## 3 2016-04-25 05:23:57 28C2DDDD4534 27 30.00 65 25.08374 121.5070
## 4 2016-04-25 05:25:13 28C2DDDD47C6 30 26.12 93 25.01992 121.5305
## 5 2016-04-25 05:24:48 28C2DDDD4234 30 28.00 83 25.00244 121.5519
## 7 2016-04-25 05:24:04 28C2DDDD455E 30 28.12 96 25.02298 121.4993
## school time2
## 1 景興國小 2016/4/25 5:30
## 2 西松國小 2016/4/25 5:30
## 3 葫蘆國小 2016/4/25 5:30
## 4 溫州國宅 2016/4/25 5:30
## 5 興隆國小 2016/4/25 5:30
## 7 萬大國小 2016/4/25 5:30
dat2 = read.csv("data2_2.csv", header = TRUE, fileEncoding = 'CP950') #請將路徑改為自己放置的位置
dat2.clean = dat2[duplicated(dat2)==FALSE,]
head(dat2.clean)
## time device_id s_d0 s_t0 s_h0 lat lon
## 1 2016-04-25 06:55:23 28C2DDDD4591 33 26.75 91 25.00070 121.5754
## 2 2016-04-25 06:55:28 28C2DDDD41EB 41 27.87 87 25.11949 121.5050
## 3 2016-04-25 06:55:36 28C2DDDD4598 41 28.50 88 25.06126 121.5111
## 4 2016-04-25 06:55:59 28C2DDDD452E 42 26.62 88 25.03944 121.5462
## 5 2016-04-25 06:56:04 28C2DDDD4372 42 28.75 85 25.06035 121.5906
## 6 2016-04-25 06:56:18 28C2DDDD4338 38 26.62 89 25.14918 121.5242
## school time2
## 1 博嘉國小 2016/4/25 7:00
## 2 立農國小 2016/4/25 7:00
## 3 永樂國小 2016/4/25 7:00
## 4 私立復興實驗中學 2016/4/25 7:00
## 5 潭美國小 2016/4/25 7:00
## 6 泉源國小 2016/4/25 7:00
# Join the two cleaned tables on "school". all = TRUE keeps schools that
# appear in only one of the tables (the missing side is filled with NA).
# Columns present in both tables get the suffix .x (from dat1.clean) or
# .y (from dat2.clean).
merge.dat = merge(dat1.clean, dat2.clean, by = "school", all = TRUE)
head(merge.dat)
## school time.x device_id.x s_d0.x s_t0.x
## 1 仁愛國小 2016-04-25 05:23:46 28C2DDDD456A 37 27.87
## 2 內湖國小 2016-04-25 05:25:32 28C2DDDD436D 42 27.62
## 3 南湖國小 2016-04-25 05:25:48 28C2DDDD434D 43 28.50
## 4 古亭國小 2016-04-25 05:24:42 28C2DDDD41C0 33 27.87
## 5 台北市立大學附設實驗國小 2016-04-25 05:25:16 28C2DDDD4588 33 27.75
## 6 吉林國小 2016-04-25 05:26:20 28C2DDDD458F 38 28.62
## s_h0.x lat.x lon.x time2.x time.y device_id.y
## 1 89 25.03577 121.5524 2016/4/25 5:30 2016-04-25 06:58:42 28C2DDDD456A
## 2 89 25.07913 121.5802 2016/4/25 5:30 2016-04-25 06:56:22 28C2DDDD436D
## 3 88 25.06870 121.6120 2016/4/25 5:30 <NA> <NA>
## 4 100 25.02058 121.5288 2016/4/25 5:30 <NA> <NA>
## 5 88 25.03579 121.5135 2016/4/25 5:30 <NA> <NA>
## 6 87 25.05472 121.5296 2016/4/25 5:30 <NA> <NA>
## s_d0.y s_t0.y s_h0.y lat.y lon.y time2.y
## 1 39 28.00 87 25.03577 121.5524 2016/4/25 7:00
## 2 40 27.12 90 25.07913 121.5802 2016/4/25 7:00
## 3 NA NA NA NA NA <NA>
## 4 NA NA NA NA NA <NA>
## 5 NA NA NA NA NA <NA>
## 6 NA NA NA NA NA <NA>
# Keep only the s_d0 reading and the school name from each cleaned table,
# then merge by school so the two readings sit side by side
# (s_d0.x from the first file, s_d0.y from the second).
dat1.simple = dat1.clean[,c("s_d0", "school")]
dat2.simple = dat2.clean[,c("s_d0", "school")]
simple.merge.dat = merge(dat1.simple, dat2.simple, by = "school", all = TRUE)
– 注意,當等號的左右邊同時出現simple.merge.dat時,舊的simple.merge.dat將會在運算後永遠消失,而新的simple.merge.dat將會取代他
# Read, de-duplicate and simplify the third file the same way as the first two.
dat = read.csv("data2_3.csv", header = TRUE, fileEncoding = 'CP950') # change the path to wherever you saved the file
dat.clean = dat[duplicated(dat)==FALSE,]
dat.simple = dat.clean[,c("s_d0", "school")]
# Pay attention to this line: simple.merge.dat appears on both sides of the
# assignment, so the merged result replaces the previous simple.merge.dat.
simple.merge.dat = merge(simple.merge.dat, dat.simple, by = "school", all = TRUE)
– 請再點這裡下載最後一個檔案
下面這串函數可以依序慢慢合併檔案,但是我希望大家把他改寫成迴圈形式,這樣我們才有可能一次處理上千個檔案
這邊會用到新函數「paste()」
# First build simple.merge.dat from the first two files.
dat1 = read.csv("data2_1.csv", header = TRUE, fileEncoding = 'CP950')
dat1.clean = dat1[duplicated(dat1)==FALSE,]
dat1.simple = dat1.clean[,c("s_d0", "school")]
dat2 = read.csv("data2_2.csv", header = TRUE, fileEncoding = 'CP950')
dat2.clean = dat2[duplicated(dat2)==FALSE,]
dat2.simple = dat2.clean[,c("s_d0", "school")]
simple.merge.dat = merge(dat1.simple, dat2.simple, by = "school", all = TRUE)
# From here on, keep reading in new files to extend simple.merge.dat step by
# step. paste() builds the file name from the index i (e.g. "data2_3.csv").
i = 3
dat = read.csv(paste("data2_", i, ".csv", sep = ""), header = TRUE, fileEncoding = 'CP950')
dat.clean = dat[duplicated(dat)==FALSE,]
dat.simple = dat.clean[,c("s_d0", "school")]
simple.merge.dat = merge(simple.merge.dat, dat.simple, by = "school", all = TRUE) # note: the result overwrites simple.merge.dat
# The next four statements are identical to the previous four; only i changes.
i = 4
dat = read.csv(paste("data2_", i, ".csv", sep = ""), header = TRUE, fileEncoding = 'CP950')
dat.clean = dat[duplicated(dat)==FALSE,]
dat.simple = dat.clean[,c("s_d0", "school")]
simple.merge.dat = merge(simple.merge.dat, dat.simple, by = "school", all = TRUE) # note: the result overwrites simple.merge.dat
# Show the first 6 rows.
head(simple.merge.dat)
## school s_d0.x s_d0.y s_d0.x s_d0.y
## 1 仁愛國小 37 39 45 47
## 2 內湖國小 42 40 41 40
## 3 南湖國小 43 NA 46 44
## 4 古亭國小 33 NA 37 55
## 5 台北市立大學附設實驗國小 33 NA 49 55
## 6 吉林國小 38 NA 40 40
dat1 = read.csv("data2_1.csv", header = TRUE, fileEncoding = 'CP950')
dat1.clean = dat1[duplicated(dat1)==FALSE,]
dat1.simple = dat1.clean[,c("s_d0", "school")]
dat2 = read.csv("data2_2.csv", header = TRUE, fileEncoding = 'CP950')
dat2.clean = dat2[duplicated(dat2)==FALSE,]
dat2.simple = dat2.clean[,c("s_d0", "school")]
simple.merge.dat = merge(dat1.simple, dat2.simple, by = "school", all = TRUE)
# Merge the remaining files (data2_3.csv and data2_4.csv) into
# simple.merge.dat, one file per iteration.
for (i in 3:4) {
  dat = read.csv(paste0("data2_", i, ".csv"), header = TRUE, fileEncoding = 'CP950')
  # Drop fully duplicated rows, then keep only the two columns of interest.
  dat.clean = dat[!duplicated(dat),]
  dat.simple = dat.clean[,c("s_d0", "school")]
  # Give this file's reading a unique column name before merging. Merging a
  # bare "s_d0" column again and again makes merge() reuse the ".x"/".y"
  # suffixes, which leaves several columns sharing the same name.
  colnames(dat.simple)[colnames(dat.simple) == "s_d0"] = paste0("s_d0.", i)
  # all = TRUE keeps schools that are missing from either side (filled with NA).
  simple.merge.dat = merge(simple.merge.dat, dat.simple, by = "school", all = TRUE)
}
colnames(simple.merge.dat) = c('school', paste('s_d0.', 1:4, sep = ""))
head(simple.merge.dat)
## school s_d0.1 s_d0.2 s_d0.3 s_d0.4
## 1 仁愛國小 37 39 45 47
## 2 內湖國小 42 40 41 40
## 3 南湖國小 43 NA 46 44
## 4 古亭國小 33 NA 37 55
## 5 台北市立大學附設實驗國小 33 NA 49 55
## 6 吉林國小 38 NA 40 40